import hashlib
import json
import os.path
from urllib.request import urlretrieve

import pandas as pd
import requests
from bs4 import BeautifulSoup

from utils import Download

# Load the data-source descriptor; it carries the base URL and the
# previously recorded subfile metadata that gets refreshed below.
with open("./genomeasia100k.json", 'r', encoding='utf-8') as f:
    data_source = json.load(f)
baseurl = data_source['baseurl']

# NOTE(review): verify=False disables TLS certificate validation —
# presumably the site's certificate chain is broken; confirm, and prefer
# verify=True (or a pinned CA bundle) if the server allows it.
response = requests.get(url=baseurl, verify=False, timeout=5)
if response.status_code != 200:
    # Original raised a bare ValueError with no context; include the URL
    # and status so a failed run is diagnosable from the traceback alone.
    raise ValueError(
        f"failed to fetch {baseurl}: HTTP {response.status_code}"
    )

soup = BeautifulSoup(response.text, 'html.parser')
# The page's second <tbody> holds the file listing; collect its links.
tag_a = soup.find_all('tbody')[1].find_all('a')

# # Fetch the new md5 list (disabled)
# new_md5 = pd.read_table('https://browser.genomeasia100k.org' + tag_a[-1]['href'])
# new_md5 = new_md5.values

# Collect the downloadable archives. The final link is the md5 listing,
# not a data file, so it is excluded from the slice.
filelist = []
for anchor in tag_a[:-1]:
    link = 'https://browser.genomeasia100k.org' + anchor['href']
    fname = anchor.text.strip()
    ext = os.path.splitext(fname)[1]
    if ext != '.gz':
        # Surface any unexpected extension and skip the entry.
        print(ext)
        continue
    filelist.append({'url': link, 'filename': fname})

# Download every file, recording its local path and md5 checksum.
for index, file in enumerate(filelist):
    url = file['url']
    filename = file['filename']
    path = os.path.join('download', 'genomeasia100k', filename)

    # # Skip when the md5 is unchanged (disabled)
    # if new_md5[index][1] == data_source['subfiles'][index]['md5']:
    #     continue

    # Download the file; Download.start() is expected to return the md5
    # of the downloaded payload — TODO confirm against utils.Download.
    download = Download(url, fileName=filename, dirName='genomeasia100k')
    md5 = download.start()

    # `file` aliases filelist[index], so mutating it updates the list.
    file['path'] = path
    file['md5'] = md5

# Persist the refreshed subfile metadata back to the descriptor.
# (The original attached this as a `for/else`; with no `break` in the
# loop the `else` always ran, so plain post-loop code is equivalent and
# far less misleading.)
data_source['subfiles'] = filelist

with open('./genomeasia100k.json', 'w', encoding='utf-8') as f:
    json.dump(data_source, f)
